import scipy.io as scio
from shutil import copyfile
import os 
import random
import itertools
import numpy as np
import re
import shutil  

# Start every run from an empty ./synthtext/ output directory.
# rmtree raises FileNotFoundError when the directory is absent (e.g. on the
# very first run), so that specific case is tolerated; any other failure
# (permissions, ...) still propagates.
try:
    shutil.rmtree('./synthtext/')
except FileNotFoundError:
    pass
os.mkdir('./synthtext/')

# --- run configuration -------------------------------------------------
# Number of random draws made by the sampling loop.
num_image = 1000
# Dataset root; gt.mat is read from here and image paths are joined onto it.
data_path = '/home/kxc/disk_2t/kxc/paper_exper_text_data/Dataset/Dataset/SynthText'
# Directory the selected images are copied into.
target = "./synthtext/"

# --- accumulators filled while sampling --------------------------------
box_info = {}   # image file name -> list of per-word character boxes
word_info = {}  # image file name -> word boxes
gt_sample = []

# --- ground-truth annotations ------------------------------------------
gt = scio.loadmat(os.path.join(data_path, 'gt.mat'))
print(gt.keys())

# Each annotation field is stored as a single row; take that row.
charbox, wordbox, imgtxt = (gt[field][0] for field in ('charBB', 'wordBB', 'txt'))
k_list = list(gt['imnames'][0])
gt_len = len(k_list)
print("------------", gt_len, "-------------")
# Randomly pick images from the ground truth, copy each one into `target`,
# and record its character boxes (grouped per word) and word boxes in
# box_info / word_info, keyed by the image file name. Progress and the
# collected boxes are also written to a text log.
sel_image = set()

# Sample WITHOUT replacement so a repeated draw cannot silently shrink the
# output below num_image; capped at gt_len so a small dataset still works.
sampled_indices = random.sample(range(gt_len), min(num_image, gt_len))

# The context manager guarantees the log is flushed and closed even when one
# of the consistency checks below raises.
with open("./synthtext/genSceneSynth_log.txt", 'w') as fp:
    for index, randIndex in enumerate(sampled_indices):
        fp.write("\n\n************" + str(index)+"/"+str(randIndex)+" "+str(k_list[randIndex][0]))
        print("************", index, "/", randIndex, " ", k_list[randIndex][0])

        img_path = os.path.join(data_path, k_list[randIndex][0])
        img_name = os.path.basename(img_path)
        # Outputs are keyed by bare file name, so two different entries that
        # share a file name would overwrite each other; keep the first only.
        if img_name in sel_image:
            continue
        sel_image.add(img_name)

        # Split every text instance on spaces/newlines into individual words
        # and drop the empty fragments.
        words = [re.split(' \n|\n |\n| ', t.strip()) for t in imgtxt[randIndex]]
        words = list(itertools.chain.from_iterable(words))
        words = [t for t in words if len(t) > 0]

        # Reorder box arrays so the first axis indexes the box. A 2-D array
        # (presumably a single box whose trailing axis was dropped on load)
        # gets a leading axis of length 1; charBB previously lacked this
        # handling and failed in transpose for such images.
        raw_charbox = charbox[randIndex]
        if raw_charbox.ndim > 2:
            _charbox = raw_charbox.transpose((2, 1, 0))
        else:
            _charbox = raw_charbox.transpose((1, 0))[np.newaxis, ...]

        raw_wordbox = wordbox[randIndex]
        if raw_wordbox.ndim > 2:
            _wordbox = raw_wordbox.transpose((2, 1, 0))
        else:
            _wordbox = raw_wordbox.transpose((1, 0))[np.newaxis, ...]

        fp.write("\n_wordbox/len(words):"+str(_wordbox.shape)+"/"+str(len(words)))
        print("_wordbox / len(words)", _wordbox.shape, len(words))

        # Explicit raise instead of assert: asserts are stripped under -O,
        # which would let mismatched annotations through.
        if _wordbox.shape[0] != len(words):
            raise ValueError(
                "word box count %d does not match word count %d for %s"
                % (_wordbox.shape[0], len(words), img_name)
            )

        # Character boxes are stored flat; consume them word by word.
        character_bboxes = []
        total = 0
        for word in words:
            bboxes = np.array(_charbox[total:total + len(word)])
            # A short slice means there are fewer boxes than characters.
            if len(bboxes) != len(word):
                raise ValueError(
                    "expected %d character boxes for word %r in %s, got %d"
                    % (len(word), word, img_name, len(bboxes))
                )
            total += len(word)
            print(bboxes.shape)
            character_bboxes.append(bboxes)

        fp.write("\n_wordbox/len(character_bboxes):"+str(_wordbox.shape)+"/"+str(len(character_bboxes)))
        print("_wordbox / len(character_bboxes)", _wordbox.shape, len(character_bboxes))

        box_info[img_name] = character_bboxes
        word_info[img_name] = _wordbox
        copyfile(img_path, os.path.join(target, img_name))

        write_bb = [str(i) for i in character_bboxes]
        fp.write("\ncharacter_bboxes")
        fp.write('\n'.join(write_bb))
        fp.write("\n_wordbox"+str(_wordbox))
# Write the collected annotations out as MATLAB .mat files.
savename1 = './synthtext/char_info.mat'  # file path for the character boxes
savename2 = './synthtext/word_info.mat'  # file path for the word boxes

for mat_path, mat_payload in ((savename1, box_info), (savename2, word_info)):
    scio.savemat(mat_path, mat_payload)


